Load in packages.
library(ballgown, quietly = TRUE)
##
## Attaching package: 'ballgown'
## The following object is masked from 'package:base':
##
## structure
library(genefilter, quietly = TRUE)
library(dplyr, quietly = TRUE)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:ballgown':
##
## contains, expr, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(devtools, quietly = TRUE)
library(ggplot2, quietly = TRUE)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:ballgown':
##
## expr
library(gplots, quietly = TRUE)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(GenomicRanges, quietly = TRUE)
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
##
## space
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:base':
##
## expand.grid
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
library(viridis, quietly = TRUE)
library(pheatmap, quietly = TRUE)
Create objects containing the directory of our files.
work.dir <- "/media/cbwdata/workspace/Module6/Module6_Lab/de/ballgown"
pheno.dir <- "ref_only"
Load phenotype data from a file we saved.
pheno_data = read.csv(file = file.path(work.dir,pheno.dir,"carcinoma_vs_normal.csv"))
pheno_data
Load ballgown data structure and save it to a variable “bg”.
How many annotated transcripts does our ballgown data structure contain? 1
bg <- ballgown(samples=as.vector(pheno_data$path),pData=pheno_data)
## Tue Jun 8 22:31:39 2021
## Tue Jun 8 22:31:39 2021: Reading linking tables
## Tue Jun 8 22:31:39 2021: Reading intron data files
## Tue Jun 8 22:31:39 2021: Merging intron data
## Tue Jun 8 22:31:39 2021: Reading exon data files
## Tue Jun 8 22:31:40 2021: Merging exon data
## Tue Jun 8 22:31:40 2021: Reading transcript data files
## Tue Jun 8 22:31:40 2021: Merging transcript data
## Wrapping up the results
## Tue Jun 8 22:31:40 2021
bg
## ballgown instance with 6581 transcripts and 6 samples
Load all attributes including gene name.
bg_table = texpr(bg, 'all')
bg_table
bg_gene_names = unique(bg_table[, 9:10])
bg_gene_names
Save the ballgown object to a file for later use.
save(bg, file = file.path(work.dir,'bg.rda'))
Using the stattest function, we’ll test for differential expression of transcripts. First we’ll do this without filtering the data for low abundance transcripts.
results_transcripts <- stattest(bg, feature = "transcript", covariate = "type", getFC = TRUE, meas = "FPKM")
We can also test for differential expression of whole genes.
results_genes <- stattest(bg, feature = "gene", covariate = "type", getFC = TRUE, meas = "FPKM")
results_genes <- merge(x = results_genes, y = bg_gene_names, by.x = c("id"), by.y = c("gene_id"))
results_genes
Why do we perform multiple test correction? 2
Save a tab delimited file for both the transcript and gene results.
write.table(x = results_transcripts, file = file.path(work.dir,"carcinoma_vs_normal_transcript_results.tsv"),sep="\t", quote = FALSE, row.names = FALSE)
write.table(x = results_genes, file = file.path(work.dir,"carcinoma_vs_normal_gene_results.tsv"),sep="\t", quote = FALSE, row.names = FALSE)
Filter low-abundance genes. Here we remove all transcripts with a variance across the samples of less than 1.
Q: How many transcripts do we lose by filtering? 3
bg_filt <- subset (bg, "rowVars(texpr(bg)) > 1", genomesubset=TRUE)
Load all attributes including gene name.
bg_filt_table = texpr(bg_filt , 'all')
bg_filt_gene_names = unique(bg_filt_table[, 9:10])
Run stattest on the filtered data to test for differential expression.
results_transcripts = stattest(bg_filt, feature = "transcript", covariate = "type", getFC = TRUE, meas = "FPKM")
results_genes = stattest(bg_filt, feature = "gene", covariate = "type", getFC = TRUE, meas = "FPKM")
results_genes = merge(x = results_genes, y = bg_filt_gene_names, by.x = c("id"), by.y = c("gene_id"))
length(which(results_genes$qval < 0.05))
## [1] 1
How many genes are statistically significantly differentially expressed between our tumour and normal samples? 4
Output the filtered list of genes and transcripts and save to tab delimited files.
write.table(x = results_transcripts, file = file.path(work.dir,"carcinoma_vs_normal_transcript_results_filtered.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
write.table(x = results_genes, file = file.path(work.dir,"carcinoma_vs_normal_gene_results_filtered.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
Identify the statistically significant genes with q-value < 0.05.
sig_transcripts <- subset(results_transcripts, results_transcripts$qval<0.05)
sig_genes <- subset(results_genes, results_genes$qval<0.05)
Output the signifant gene results to a pair of tab delimited files.
write.table(x = sig_transcripts, file = file.path(work.dir,"carcinoma_vs_normal_transcript_results_sig.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
write.table(x = sig_genes, file = file.path(work.dir,"carcinoma_vs_normal_gene_results_sig.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
If you want to save all your data from this session:
save.image("save_Module6_Lab_ballgown.RData")
This script was written by:
Malachi Griffith, mgriffit[AT]genome.wustl.edu Obi Griffith, ogriffit[AT]genome.wustl.edu Jason Walker, jason.walker[AT]wustl.edu
McDonneLll Genome Institute, Washington Univerisity School of Medicine R tutorial for CBW - Bioinformatics for Cancer Genomics - RNA Sequence Analysis R tutorial for CSHL - Advanced Sequencing Technologies & Applications
Pull the gene_expression data frame from the ballgown object.
gene_expression = as.data.frame(gexpr(bg))
We’re going to view the expression values for the transcripts of a particular gene symbol of chromosome 9 (i.e. PCA3).
First determine the rows in the data.frame that match PCA3 (AKA ENSG00000225937) then display only those rows of the data.frame. There are two methods given for this.
What’s the difference between the two methods of retrieving the index of ENSG00000225937?5
i <- row.names(gene_expression) == "ENSG00000225937"
#i <- which(row.names(gene_expression) == "ENSG00000225937")
gene_expression[i,]
Load the transcript to gene index from the ballgown object.
transcript_gene_table = indexes(bg)$t2g
transcript_gene_table
Many genes will have only 1 transcript, some genes will have several transcripts. Use the ‘table()’ command to count the number of times each gene symbol occurs (i.e. the # of transcripts that have each gene symbol).
How many genes have 1 transcript? More than one transcript? What is the maximum number of transcripts for a single gene?
counts <- table(transcript_gene_table[,"g_id"])
c_one <- length(which(counts == 1))
c_more_than_one <- length(which(counts > 1))
c_max <- max(counts)
Now use the ‘hist’ command to create a histogram of these counts.
col.pal <- inferno(n = 10, begin = 0.2, end = 0.8)
par(mar = c(4,5,4,2))
hist(x = counts, breaks = 50, col = col.pal[1], xlab = "Transcripts per gene", ylab = "", main = "Distribution of transcript count per gene", yaxt='n', border = FALSE)
axis(side=2, at=axTicks(2), las = 1, labels=formatC(axTicks(2), format="d", big.mark=','))
mtext(text = "Frequency", side = 2, line = 4)
legend_text = c(paste("Genes with one transcript =", prettyNum(x = c_one, big.mark = ",")), paste("Genes with more than one transcript =", c_more_than_one), paste("Max transcripts for single gene = ", c_max))
legend("topright", legend_text, lty=NULL, bty = "n")
par(mar = c(4,5,4,2))
hist(x = counts, breaks = 50, col = col.pal[1], xlab = "Transcripts per gene", ylab = "", main = "Distribution of transcript count per gene", yaxt='n', border = FALSE, ylim = c(0,100))
axis(side=2, at=axTicks(2), las = 1, labels=formatC(axTicks(2), format="d", big.mark=','))
mtext(text = "Frequency", side = 2, line = 4)
legend_text = c(paste("Genes with one transcript =", prettyNum(x = c_one, big.mark = ",")), paste("Genes with more than one transcript =", c_more_than_one), paste("Max transcripts for single gene = ", c_max))
legend("topright", legend_text, lty=NULL, bty = "n")
In this analysis we supplied StringTie with transcript models so the lengths will be those of known transcripts. However, if we had used a de novo transcript discovery mode, this step would give us some idea of how well transcripts were being assembled. If we had a low coverage library, or other problems, we might get short ‘transcripts’ that are actually only pieces of real transcripts.
full_table <- texpr(bg , 'all')
par(mar = c(4,5,4,2))
hist(full_table$length, breaks=50, xlab="Transcript length (bp)", ylab = "", main="Distribution of transcript lengths", col = col.pal[2], yaxt='n', xaxt = 'n', border = FALSE)
axis(side=1, at=axTicks(1), las = 1, labels=formatC(axTicks(1), format="d", big.mark=','))
axis(side=2, at=axTicks(2), las = 1, labels=formatC(axTicks(2), format="d", big.mark=','))
mtext(text = "Frequency", side = 2, line = 4)
Using the functions mean and median, what are the mean and median of the transcript lengths?6
When analysing RNA-seq, we often convert raw read counts to Fragments Per Kilobase of transcript per Million mapped reads (FPKM). This allows us to account for varying transcript length. Here we’ll summarise some of the information pertaining to FPKM.
What are the minimum and maximum FPKM values for a particular library?
min(gene_expression[,"FPKM.carcinoma_C02"])
## [1] 0
max(gene_expression[,"FPKM.carcinoma_C02"])
## [1] 7563.955
range(gene_expression[,"FPKM.carcinoma_C02"])
## [1] 0.000 7563.955
If the below command gives you the maximum FPKM for each library, how might you retrieve the minimum FPKM? 7
sapply(X = gene_expression, FUN = max)
## FPKM.carcinoma_C02 FPKM.carcinoma_C03 FPKM.carcinoma_C06 FPKM.normal_N02
## 7563.955 14228.359 21039.455 8301.529
## FPKM.normal_N03 FPKM.normal_N06
## 12590.622 15177.052
We’re going to be log transforming the data. What happens when you log transform 0?
Define a pseudocount to off-set any zero FPKM values. Do this by grabbing a copy of all data values, coverting 0’s to NA, and calculating the minimum or all non NA values
#zz = fpkm_matrix[,data_columns]
#zz[zz==0] = NA
#min_nonzero = min(zz, na.rm=TRUE)
#min_nonzero
Alternatively just set min value to 1.
min_nonzero=1
We’re going to visualise the FPKM as boxplots. Note that the bold horizontal line on each boxplot is the median
names <- rep(c(2,3,6), 2)
ylab <- expression('log'[2]*'FPKM')
boxplot(log2(gene_expression+min_nonzero), col=col.pal[c(1:3,8:10)], names = names, ylab = ylab, main="Distribution of FPKM", las = 1, border = col.pal[c(1:3,8:10)])
lines(x = c(0.7,3.3), y = c(-3,-3), lwd = 2, xpd = TRUE)
lines(x = c(3.7,6.3), y = c(-3,-3), lwd = 2, xpd = TRUE)
mtext(text = c("Carinoma","Normal"), side = 1, at = c(2,5), line = 3)
Display only the genes that are statistically significantly differentially expressed according to Ballgown.
sig <- which(results_genes$pval<0.05)
results_genes[,"log2fc"] <- log2(results_genes[,"fc"])
breaks <- 50
xlab <- expression('log'[2]*'(Fold change)')
main <- "Distribution of differential expression values"
col <- rep(c(col.pal[3],"lightgrey",col.pal[3]),c(15,20,19))
hist(x = as.numeric(results_genes[sig,"log2fc"]), breaks = breaks, col= col, xlab = xlab, main = main, las = 1, border = FALSE)
abline(v=-2, col="grey20", lwd=1, lty=2)
abline(v=2, col="grey20", lwd=1, lty=2)
legend("topright", "Fold-change > |4|", bty = 'n', fill = col[1], border = NA)
gene_expression[,"carcinoma"] <- apply(gene_expression[,c(1:3)], 1, mean)
gene_expression[,"normal"] <- apply(gene_expression[,c(4:6)], 1, mean)
x <- log2(gene_expression[,"carcinoma"]+min_nonzero)
y <- log2(gene_expression[,"normal"]+min_nonzero)
pch <- 16
cex <- 0.25
xlab <- expression("Carcinoma FPKM (log"[2]*")")
ylab <- expression("Normal FPKM (log"[2]*")")
main <- "Carcinoma vs normal FPKMs"
plot(x = x, y = y, pch = 19, cex = 0.5, xlab = xlab, ylab = ylab, main = main, las = 1, col = "darkgrey")
xsig <- x[sig]
ysig <- y[sig]
points(x = xsig, y = ysig, col = col.pal[4], pch = 19, cex = 1)
legend("topleft", "p < 0.05", col = col.pal[4], pch = 19, bty = 'n')
abline(a = 0, b = 1, col = "grey20", lty = 2)
Get the gene symbols for the top N (according to corrected p-value) and display them on the plot
topn <- order(abs(results_genes[sig,"fc"]), decreasing=TRUE)[1:10]
topn <- order(results_genes[sig,"qval"])[1:10]
plot(x = x, y = y, pch = 19, cex = 0.5, xlab = xlab, ylab = ylab, main = main, las = 1, col = "darkgrey")
xsig <- x[sig]
ysig <- y[sig]
points(x = xsig, y = ysig, col = col.pal[4], pch = 19, cex = 1)
legend("topleft", "p < 0.05", col = col.pal[4], pch = 19, bty = 'n')
abline(a = 0, b = 1, col = "grey20", lty = 2)
text(x[topn], y[topn], results_genes[topn,"gene_name"], col="black", cex=0.75, srt=0)
sigpi <- which(results_genes[,"pval"]<0.05)
sigp <- results_genes[sigpi,]
sigde <- which(abs(sigp[,"log2fc"]) >= 2)
sig_tn_de <- sigp[sigde,]
o = order(sig_tn_de[,"qval"], -abs(sig_tn_de[,"log2fc"]), decreasing=FALSE)
sig_tn_de <- sig_tn_de[o,c("gene_name","id","fc","log2fc","pval","qval")]
sig_tn_de
write.table(sig_tn_de, file = file.path(work.dir,"SigDE_Module6_Lab.txt"), sep="\t", row.names=FALSE, quote=FALSE)
Define custom dist and hclust functions for use with heatmaps.
mydist <- function(c) {dist(c,method="euclidian")}
myclust <- function(c) {hclust(c,method="ward.d2")}
main <- "Significantly Differentially Expressed Genes"
par(cex.main=0.8)
sig_genes <- results_genes[sig,"id"]
sig_gene_names <- results_genes[sig,"gene_name"]
data <- log2(as.matrix(gene_expression[sig_genes,1:6])+1)
annotation_col <- data.frame(Group = pheno_data$type)
annotation_col[,1] <- gsub(pattern = "carcinoma", replacement = "Carcinoma", x = annotation_col[,1])
annotation_col[,1] <- gsub(pattern = "normal", replacement = "Normal", x = annotation_col[,1])
rownames(annotation_col) <- colnames(data)
annotation_colors <- list(Group = c(col.pal[3],col.pal[6]))
names(annotation_colors$Group) <- c("Carcinoma","Normal")
pheatmap(mat = data, hclustfun = myclust, distfun = mydist, na.rm = TRUE, scale = "none", dendrogram = "both", margins = c(6,7), Rowv = TRUE, Colv = TRUE, symbreaks = FALSE, key = TRUE, symkey = FALSE, density.info. = "none", trace = "none", main = main, cexRow = 0.3, cexCol = 1, labRow = sig_gene_names,col = inferno(n = 100), annotation_col = annotation_col, annotation_colors = annotation_colors, border_color = NA, show_rownames = TRUE, show_colnames = FALSE, fontsize_row = 8)
With row scaling we can see which samples have the highest and lowest expression of each gene.
pheatmap(mat = data, hclustfun = myclust, distfun = mydist, na.rm = TRUE, scale = "row", dendrogram = "both", margins = c(6,7), Rowv = TRUE, Colv = TRUE, symbreaks = FALSE, key = TRUE, symkey = FALSE, density.info. = "none", trace = "none", main = main, cexRow = 0.3, cexCol = 1, labRow = sig_gene_names,col = inferno(n = 100), annotation_col = annotation_col, annotation_colors = annotation_colors, border_color = NA, show_rownames = TRUE, show_colnames = FALSE, fontsize_row = 8)
6581↩︎
Each p-value represents a statistical test that a given gene is differentially expressed. When we perform multiple statistical tests, the likelihood of observing a differentially expressed gene by chance. We correct the the p-value based on the number of tests we perform, which gives us a q-value.↩︎
6581-2977=3604↩︎
1↩︎
The first returns a logical class object containing 2241 FALSE and 1 TRUE value. The TRUE value demarks the position of ENSG00000225937. The second returns a numerical object of length 1 containing the position of ENSG00000225937.↩︎
mean(full_table$length) returns 1647.06 and median(full_table$length) returns 872.↩︎
Replace max with min.↩︎